Lecture 4: dplyr

dplyr (and friends)

  • dplyr makes it easy to perform data manipulation…
  • A collection of verbs (do this) helps us translate thought to code
    • mutate (create) new variables
    • select variables
    • filter observations
    • summarise values
    • arrange observations or rows
  • dplyr functions are pipeable, the data argument has the first position, and each function returns a data frame

Data in our examples

library(exscidata) # Load the data package
library(tidyverse) # tidyverse includes dplyr

glimpse(cyclingstudy) # Overview of variables in the data set
Rows: 80
Columns: 101
$ subject              <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16…
$ group                <chr> "INCR", "DECR", "INCR", "DECR", "DECR", "INCR", "…
$ timepoint            <chr> "pre", "pre", "pre", "pre", "pre", "pre", "pre", …
$ age                  <dbl> 33, 32, 39, 37, 31, 33, 42, 26, 41, 35, 34, 41, 3…
$ height.T1            <dbl> 183, 174, 193, 175, 176, 168, 180, 179, 185, 187,…
$ weight.T1            <dbl> 80.3, 71.4, 98.1, 79.2, 88.0, 79.6, 77.6, 75.5, 8…
$ sj.max               <dbl> 30.97, 31.55, 26.76, 29.23, 31.22, 34.24, 30.11, …
$ cmj.max              <dbl> 34.98, 33.85, 28.79, 30.77, 25.84, 35.27, 32.99, …
$ lac.125              <dbl> 1.50, 1.19, 1.17, 0.88, 1.06, 1.27, 0.85, 0.93, 1…
$ lac.175              <dbl> 1.86, 1.49, 1.52, 0.99, 1.41, 1.73, 0.84, 1.34, 1…
$ lac.225              <dbl> 2.38, 2.34, 1.22, 2.13, 1.90, 3.21, 1.16, 1.94, 1…
$ lac.250              <dbl> 3.54, 3.21, 1.54, 3.25, 2.04, 4.83, 1.71, NA, NA,…
$ lac.275              <dbl> 6.21, 5.33, 2.04, NA, 3.04, NA, 3.33, 3.71, 3.24,…
$ lac.300              <dbl> NA, NA, 3.32, 6.15, 3.59, NA, 6.25, 7.29, 6.21, 1…
$ lac.325              <dbl> NA, NA, 4.72, NA, 4.73, NA, NA, NA, NA, 2.60, NA,…
$ lac.350              <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, 4.69, NA, NA,…
$ lac.375              <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ VO2.125              <dbl> 2309.000, 1993.000, 2009.587, 2044.154, 2315.000,…
$ VO2.175              <dbl> 2640.000, 2583.000, 2845.729, 2676.632, 2848.000,…
$ VO2.225              <dbl> 3165.000, 3275.000, 3306.307, 3222.226, 3493.000,…
$ VO2.250              <dbl> 3794.000, 3548.000, 3617.408, 3644.501, 3735.000,…
$ VO2.275              <dbl> 4290.000, 3829.000, 4052.763, NA, 4091.000, NA, 3…
$ VO2.300              <dbl> NA, NA, 3887.403, 4250.821, 4502.000, NA, 3955.00…
$ VO2.325              <dbl> NA, NA, 4395.627, NA, 4775.000, NA, NA, NA, NA, 4…
$ VO2.350              <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, 4595, NA, NA,…
$ VO2.375              <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ VCO2.125             <dbl> 2113, 1794, 1782, 1935, 2179, 2031, 1848, 2100, 1…
$ VCO2.175             <dbl> 2487, 2531, 2572, 2593, 2685, 2490, 2175, 2582, 2…
$ VCO2.225             <dbl> 2981, 3194, 3071, 3208, 3342, 3237, 2691, 3170, 2…
$ VCO2.250             <dbl> 3657, 3471, 3448, 3718, 3574, 3588, 2980, NA, NA,…
$ VCO2.275             <dbl> 4287, 3935, 3861, NA, 3977, NA, 3431, 3799, 3630,…
$ VCO2.300             <dbl> NA, NA, 3846, 4486, 4339, NA, 3920, 4224, 4132, 3…
$ VCO2.325             <dbl> NA, NA, 4439, NA, 4773, NA, NA, NA, NA, 4076, NA,…
$ VCO2.350             <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, 4611, NA, NA,…
$ VCO2.375             <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ VE.125               <dbl> 55, 48, 50, 50, 61, 50, 50, 55, 45, 48, 37, 59, 4…
$ VE.175               <dbl> 68, 64, 65, 62, 74, 60, 63, 68, 61, 57, 51, 73, 5…
$ VE.225               <dbl> 80, 85, 79, 79, 95, 80, 75, 82, 79, 68, 68, 90, 7…
$ VE.250               <dbl> 102, 90, 95, 92, 102, 90, 90, NA, NA, 76, 82, 100…
$ VE.275               <dbl> 133, 109, 103, NA, 120, 112, 112, 100, 105, 85, 1…
$ VE.300               <dbl> NA, NA, 108, 120, 124, 138, NA, 120, 138, 98, NA,…
$ VE.325               <dbl> NA, NA, 134, NA, 150, NA, NA, NA, NA, 100, NA, NA…
$ VE.350               <chr> NA, NA, "VE", NA, NA, NA, NA, NA, NA, "117", NA, …
$ VE.375               <chr> NA, NA, "VE", NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ HF.125               <dbl> 109, 121, 114, 106, 109, 120, 115, 126, 100, 115,…
$ HF.175               <dbl> 124, 140, 121, 118, 123, 132, 127, 142, 113, 127,…
$ HF.225               <dbl> 139, 160, 135, 131, 139, 143, 147, 155, 131, 138,…
$ HF.250               <dbl> 152, 170, 146, 144, 148, 150, 159, NA, NA, 148, 1…
$ HF.275               <dbl> 167, 176, 157, NA, 159, NA, 170, 169, 148, 154, 1…
$ HF.300               <dbl> NA, NA, 168, 160, 164, NA, 176, 178, 161, 160, NA…
$ HF.325               <dbl> NA, NA, 172, NA, 170, NA, NA, NA, NA, 164, NA, NA…
$ HF.350               <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, 168, NA, NA, …
$ HF.375               <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ RPE.B.125            <dbl> 9, 11, 9, 9, 9, 7, 11, 10, 8, 10, 10, 9, 9, 8, 9,…
$ RPE.B.175            <dbl> 12, 13, 12, 11, 11, 11, 12, 12, 11, 13, 13, 11, 1…
$ RPE.B.225            <dbl> 14, 15, 13, 11, 13, 13, 13, 13, 13, 14, 14, 13, 1…
$ RPE.B.250            <dbl> 15, 17, 13, 13, 14, 15, 15, NA, NA, 15, 17, 14, 1…
$ RPE.B.275            <dbl> 16, 18, 15, NA, 15, NA, 16, 15, 15, 15, 18, 15, N…
$ RPE.B.300            <dbl> NA, NA, 15, 15, 16, NA, 18, 17, 17, 16, NA, NA, N…
$ RPE.B.325            <dbl> NA, NA, 15, NA, 17, NA, NA, NA, NA, 16, NA, NA, N…
$ RPE.B.350            <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, 17, NA, NA, N…
$ RPE.B.375            <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ RPE.L.125            <dbl> 9, 11, 7, 9, 9, 7, 11, 10, 8, 8, 5, 9, 9, 8, 9, 9…
$ RPE.L.175            <dbl> 13, 13, 11, 11, 11, 11, 13, 12, 11, 11, 10, 11, 1…
$ RPE.L.225            <dbl> 15, 15, 12, 13, 13, 13, 14, 13, 14, 12, 16, 13, 1…
$ RPE.L.250            <dbl> 15, 17, 13, 15, 15, 15, 15, NA, NA, 13, 17, 14, 1…
$ RPE.L.275            <dbl> 16, 18, 15, NA, 16, NA, 16, 15, 15, 13, 19, 15, N…
$ RPE.L.300            <dbl> NA, NA, 15, 17, 17, NA, 19, 18, 17, 14, NA, NA, N…
$ RPE.L.325            <dbl> NA, NA, 16, NA, 17, NA, NA, NA, NA, 14, NA, NA, N…
$ RPE.L.350            <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, 16, NA, NA, N…
$ RPE.L.375            <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ RPM.125              <dbl> 95, 85, 97, 90, 106, 85, 95, 99, 88, 87, 90, 102,…
$ RPM.175              <dbl> 96, 87, 97, 90, 102, 81, 97, 99, 87, 90, 92, 101,…
$ RPM.225              <dbl> 95, 90, 97, 89, 103, 81, 98, 98, 90, 91, 92, 99, …
$ RPM.250              <dbl> 100, 88, 99, 87, 102, 80, 96, NA, NA, 90, 95, 100…
$ RPM.275              <dbl> 110, 88, 102, NA, 103, NA, 95, 98, 82, 91, 92, 10…
$ RPM.300              <dbl> NA, NA, 101, 84, 101, NA, 85, 98, 90, 90, NA, NA,…
$ RPM.325              <dbl> NA, NA, 102, NA, 97, NA, NA, NA, NA, 92, NA, NA, …
$ RPM.350              <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, 93, NA, NA, N…
$ RPM.375              <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ start.load           <dbl> 200, 200, 250, 200, 250, 200, 200, 200, 200, 200,…
$ end.load             <dbl> 400, 400, 475, 425, 425, 400, 375, 425, 425, 475,…
$ tte                  <dbl> 540, 526, 570, 570, 510, 510, 480, 555, 570, 690,…
$ VO2.max              <dbl> 5629.000, 4471.000, 5597.990, 4944.024, 5748.000,…
$ VO2.diff.30sek       <dbl> 191, 90, 36, 50, 67, 17, 31, 81, 75, 287, 146, 11…
$ VO2.diff.30sek.ml_kg <dbl> 2.37858032, 1.26050420, 0.36697248, 0.63131313, 0…
$ RER.max              <dbl> 1.16, 1.19, 1.10, 1.10, 1.13, 1.18, 1.12, 110.00,…
$ VE.max               <dbl> 227, 173, 221, 166, 235, 192, 168, 170, 223, 171,…
$ HF.max               <dbl> 186, 197, 188, 178, 186, 182, 183, 192, 181, 180,…
$ HF.1min              <dbl> 155, 150, 153, 140, 124, 152, 140, NA, 136, 140, …
$ RPE.B.max            <dbl> 19, 19, 19, 19, 19, 19, 19, 20, 20, 19, 19, 19, 2…
$ RPE.L.max            <dbl> 19, 19, 20, 19, 20, 20, 20, 20, 20, 19, 20, 19, 1…
$ lac.1min             <dbl> 14.04, 10.55, 9.85, 10.96, 9.54, 15.18, 7.54, 10.…
$ lac.14min            <dbl> 6.68, 8.64, NA, 5.21, NA, 10.77, 3.33, 4.21, 7.18…
$ torque.factor        <dbl> 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8,…
$ torque               <dbl> 64.24, 56.00, 78.40, 63.36, 70.40, 63.68, 62.08, …
$ peak.power           <dbl> 1525.170, 1499.670, 1270.000, 887.140, 1594.000, …
$ mean.power           <dbl> 856.2500, 734.7100, 885.0000, 724.8200, 871.0000,…
$ fatigue              <dbl> 40.65000, 40.29000, 41.10000, 18.15000, 50.34000,…
$ total.work           <dbl> 25565.86, 21937.06, 26373.00, 21599.77, 26031.00,…
$ comment              <chr> NA, NA, "Possible error in VO2 measures 275-300W"…

select() - Select variables (columns) in a data set

cyclingstudy %>%
        select(subject, 
               group, 
               timepoint, 
               cmj.max) %>%
        print()
# A tibble: 80 × 4
   subject group timepoint cmj.max
     <dbl> <chr> <chr>       <dbl>
 1       1 INCR  pre          35.0
 2       2 DECR  pre          33.8
 3       3 INCR  pre          28.8
 4       4 DECR  pre          30.8
 5       5 DECR  pre          25.8
 6       6 INCR  pre          35.3
 7       7 MIX   pre          33.0
 8       8 MIX   pre          33.2
 9       9 MIX   pre          22.4
10      10 INCR  pre          31.1
# ℹ 70 more rows

select() - Select a range of columns with :

cyclingstudy %>%
        select(subject:age, 
               cmj.max) %>%
        print()
# A tibble: 80 × 5
   subject group timepoint   age cmj.max
     <dbl> <chr> <chr>     <dbl>   <dbl>
 1       1 INCR  pre          33    35.0
 2       2 DECR  pre          32    33.8
 3       3 INCR  pre          39    28.8
 4       4 DECR  pre          37    30.8
 5       5 DECR  pre          31    25.8
 6       6 INCR  pre          33    35.3
 7       7 MIX   pre          42    33.0
 8       8 MIX   pre          26    33.2
 9       9 MIX   pre          41    22.4
10      10 INCR  pre          35    31.1
# ℹ 70 more rows

select() - Select and rename columns

cyclingstudy %>%
 select(participant = subject, 
        group, 
        timepoint,
        cmj.max) %>%
 print()
# A tibble: 80 × 4
   participant group timepoint cmj.max
         <dbl> <chr> <chr>       <dbl>
 1           1 INCR  pre          35.0
 2           2 DECR  pre          33.8
 3           3 INCR  pre          28.8
 4           4 DECR  pre          30.8
 5           5 DECR  pre          25.8
 6           6 INCR  pre          35.3
 7           7 MIX   pre          33.0
 8           8 MIX   pre          33.2
 9           9 MIX   pre          22.4
10          10 INCR  pre          31.1
# ℹ 70 more rows

select() and selection helpers

  • last_col()/everything()
  • starts_with()
  • ends_with()
  • contains()
  • all_off()/any_off()
  • where()

Select by position with last_col() or name with everything()

  • last_col() selects the last column in a data set. Adding an offset selects the last column - n.
  • everything() selects all columns in a data set.
cyclingstudy %>%
 select(last_col()) %>%
 print()
# A tibble: 80 × 1
   comment                                                        
   <chr>                                                          
 1 <NA>                                                           
 2 <NA>                                                           
 3 Possible error in VO2 measures 275-300W                        
 4 <NA>                                                           
 5 <NA>                                                           
 6 <NA>                                                           
 7 <NA>                                                           
 8 <NA>                                                           
 9 wingate test not valid, load increased during the whole 30-sek.
10 <NA>                                                           
# ℹ 70 more rows

Select by position with last_col() or name with everything()

  • everything() selects all columns in a data set.
  • Can also be used with a list of variables (e.g. everything(vars = c("subject", "age"))
cyclingstudy %>%
 select(everything()) %>%
 print()
# A tibble: 80 × 101
   subject group timepoint   age height.T1 weight.T1 sj.max cmj.max lac.125
     <dbl> <chr> <chr>     <dbl>     <dbl>     <dbl>  <dbl>   <dbl>   <dbl>
 1       1 INCR  pre          33       183      80.3   31.0    35.0    1.5 
 2       2 DECR  pre          32       174      71.4   31.6    33.8    1.19
 3       3 INCR  pre          39       193      98.1   26.8    28.8    1.17
 4       4 DECR  pre          37       175      79.2   29.2    30.8    0.88
 5       5 DECR  pre          31       176      88     31.2    25.8    1.06
 6       6 INCR  pre          33       168      79.6   34.2    35.3    1.27
 7       7 MIX   pre          42       180      77.6   30.1    33.0    0.85
 8       8 MIX   pre          26       179      75.5   32.8    33.2    0.93
 9       9 MIX   pre          41       185      82.4   22.7    22.4    1.48
10      10 INCR  pre          35       187      75.6   29.7    31.1    0.93
# ℹ 70 more rows
# ℹ 92 more variables: lac.175 <dbl>, lac.225 <dbl>, lac.250 <dbl>,
#   lac.275 <dbl>, lac.300 <dbl>, lac.325 <dbl>, lac.350 <dbl>, lac.375 <dbl>,
#   VO2.125 <dbl>, VO2.175 <dbl>, VO2.225 <dbl>, VO2.250 <dbl>, VO2.275 <dbl>,
#   VO2.300 <dbl>, VO2.325 <dbl>, VO2.350 <dbl>, VO2.375 <dbl>, VCO2.125 <dbl>,
#   VCO2.175 <dbl>, VCO2.225 <dbl>, VCO2.250 <dbl>, VCO2.275 <dbl>,
#   VCO2.300 <dbl>, VCO2.325 <dbl>, VCO2.350 <dbl>, VCO2.375 <dbl>, …

Select columns based on variable names

  • starts_with(), ends_with() and contains() helps us select columns with repeating patterns.
cyclingstudy %>%
 select(starts_with("lac.")) %>%
 print()
# A tibble: 80 × 11
   lac.125 lac.175 lac.225 lac.250 lac.275 lac.300 lac.325 lac.350 lac.375
     <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
 1    1.5     1.86    2.38    3.54    6.21   NA      NA      NA         NA
 2    1.19    1.49    2.34    3.21    5.33   NA      NA      NA         NA
 3    1.17    1.52    1.22    1.54    2.04    3.32    4.72   NA         NA
 4    0.88    0.99    2.13    3.25   NA       6.15   NA      NA         NA
 5    1.06    1.41    1.9     2.04    3.04    3.59    4.73   NA         NA
 6    1.27    1.73    3.21    4.83   NA      NA      NA      NA         NA
 7    0.85    0.84    1.16    1.71    3.33    6.25   NA      NA         NA
 8    0.93    1.34    1.94   NA       3.71    7.29   NA      NA         NA
 9    1.48    1.17    1.95   NA       3.24    6.21   NA      NA         NA
10    0.93    0.87    0.86    0.92    1.2     1.69    2.6     4.69      NA
# ℹ 70 more rows
# ℹ 2 more variables: lac.1min <dbl>, lac.14min <dbl>

Select columns based on variable names

  • starts_with(), ends_with() and contains() helps us select columns with repeating patterns.
cyclingstudy %>%
 select(ends_with("max")) %>%
 print()
# A tibble: 80 × 8
   sj.max cmj.max VO2.max RER.max VE.max HF.max RPE.B.max RPE.L.max
    <dbl>   <dbl>   <dbl>   <dbl>  <dbl>  <dbl>     <dbl>     <dbl>
 1   31.0    35.0   5629     1.16    227    186        19        19
 2   31.6    33.8   4471     1.19    173    197        19        19
 3   26.8    28.8   5598.    1.1     221    188        19        20
 4   29.2    30.8   4944.    1.1     166    178        19        19
 5   31.2    25.8   5748     1.13    235    186        19        20
 6   34.2    35.3   4633.    1.18    192    182        19        20
 7   30.1    33.0   4250     1.12    168    183        19        20
 8   32.8    33.2   4760.  110       170    192        20        20
 9   22.7    22.4   4629     1.18    223    181        20        20
10   29.7    31.1   5226     1.17    171    180        19        19
# ℹ 70 more rows

Select columns based on variable names

  • starts_with(), ends_with() and contains() helps us select columns with repeating patterns.
cyclingstudy %>%
 select(contains("VE")) %>%
 print()
# A tibble: 80 × 10
   VE.125 VE.175 VE.225 VE.250 VE.275 VE.300 VE.325 VE.350 VE.375 VE.max
    <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl> <chr>  <chr>   <dbl>
 1     55     68     80    102    133     NA     NA <NA>   <NA>      227
 2     48     64     85     90    109     NA     NA <NA>   <NA>      173
 3     50     65     79     95    103    108    134 VE     VE        221
 4     50     62     79     92     NA    120     NA <NA>   <NA>      166
 5     61     74     95    102    120    124    150 <NA>   <NA>      235
 6     50     60     80     90    112    138     NA <NA>   <NA>      192
 7     50     63     75     90    112     NA     NA <NA>   <NA>      168
 8     55     68     82     NA    100    120     NA <NA>   <NA>      170
 9     45     61     79     NA    105    138     NA <NA>   <NA>      223
10     48     57     68     76     85     98    100 117    <NA>      171
# ℹ 70 more rows

Select columns based on variable names

  • all_of() and any_of() solves a problem in selecting variables…
my_columns <- c("subject", "age", "cmj.max")

# This will not work
cyclingstudy %>%
 select(my_columns) %>%
 print()

# This works!
cyclingstudy %>%
 select(all_of(my_columns)) %>%
 print()
# A tibble: 80 × 3
   subject   age cmj.max
     <dbl> <dbl>   <dbl>
 1       1    33    35.0
 2       2    32    33.8
 3       3    39    28.8
 4       4    37    30.8
 5       5    31    25.8
 6       6    33    35.3
 7       7    42    33.0
 8       8    26    33.2
 9       9    41    22.4
10      10    35    31.1
# ℹ 70 more rows

Select columns based on variable names

  • any_of() does not give an error when a variable in the vector does not exist.
my_columns <- c("subject", 
                "age", 
                "cmj.max", 
                "another_column")


cyclingstudy %>%
 select(any_of(my_columns)) %>%
 print()
# A tibble: 80 × 3
   subject   age cmj.max
     <dbl> <dbl>   <dbl>
 1       1    33    35.0
 2       2    32    33.8
 3       3    39    28.8
 4       4    37    30.8
 5       5    31    25.8
 6       6    33    35.3
 7       7    42    33.0
 8       8    26    33.2
 9       9    41    22.4
10      10    35    31.1
# ℹ 70 more rows

Select columns based on variable type

# Select character vectors
cyclingstudy %>%
 select(where(is.character)) %>%
 print()

# Select numeric vectors
cyclingstudy %>%
 select(where(is.numeric)) %>%
 print()
# A tibble: 80 × 96
   subject   age height.T1 weight.T1 sj.max cmj.max lac.125 lac.175 lac.225
     <dbl> <dbl>     <dbl>     <dbl>  <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
 1       1    33       183      80.3   31.0    35.0    1.5     1.86    2.38
 2       2    32       174      71.4   31.6    33.8    1.19    1.49    2.34
 3       3    39       193      98.1   26.8    28.8    1.17    1.52    1.22
 4       4    37       175      79.2   29.2    30.8    0.88    0.99    2.13
 5       5    31       176      88     31.2    25.8    1.06    1.41    1.9 
 6       6    33       168      79.6   34.2    35.3    1.27    1.73    3.21
 7       7    42       180      77.6   30.1    33.0    0.85    0.84    1.16
 8       8    26       179      75.5   32.8    33.2    0.93    1.34    1.94
 9       9    41       185      82.4   22.7    22.4    1.48    1.17    1.95
10      10    35       187      75.6   29.7    31.1    0.93    0.87    0.86
# ℹ 70 more rows
# ℹ 87 more variables: lac.250 <dbl>, lac.275 <dbl>, lac.300 <dbl>,
#   lac.325 <dbl>, lac.350 <dbl>, lac.375 <dbl>, VO2.125 <dbl>, VO2.175 <dbl>,
#   VO2.225 <dbl>, VO2.250 <dbl>, VO2.275 <dbl>, VO2.300 <dbl>, VO2.325 <dbl>,
#   VO2.350 <dbl>, VO2.375 <dbl>, VCO2.125 <dbl>, VCO2.175 <dbl>,
#   VCO2.225 <dbl>, VCO2.250 <dbl>, VCO2.275 <dbl>, VCO2.300 <dbl>,
#   VCO2.325 <dbl>, VCO2.350 <dbl>, VCO2.375 <dbl>, VE.125 <dbl>, …

mutate() - Add or overwrite variables

cyclingstudy %>%
 select(subject, 
        weight.T1, 
        VO2.max) %>%
 mutate(VO2.max_kg = VO2.max / weight.T1) %>%
 print()
# A tibble: 80 × 4
   subject weight.T1 VO2.max VO2.max_kg
     <dbl>     <dbl>   <dbl>      <dbl>
 1       1      80.3   5629        70.1
 2       2      71.4   4471        62.6
 3       3      98.1   5598.       57.1
 4       4      79.2   4944.       62.4
 5       5      88     5748        65.3
 6       6      79.6   4633.       58.2
 7       7      77.6   4250        54.8
 8       8      75.5   4760.       63.0
 9       9      82.4   4629        56.2
10      10      75.6   5226        69.1
# ℹ 70 more rows

mutate() - Add or overwrite variables using group-wise operations

  • mutate() can be used to do group-wise operations using .by = var
cyclingstudy %>%
 select(subject, 
        timepoint,
        height.T1) %>%
 filter(timepoint == "meso2") %>%
 print()
# A tibble: 20 × 3
   subject timepoint height.T1
     <dbl> <chr>         <dbl>
 1       1 meso2            NA
 2       2 meso2            NA
 3       3 meso2            NA
 4       4 meso2            NA
 5       5 meso2            NA
 6       6 meso2            NA
 7       7 meso2            NA
 8       8 meso2            NA
 9       9 meso2            NA
10      10 meso2            NA
11      11 meso2            NA
12      13 meso2            NA
13      14 meso2            NA
14      15 meso2            NA
15      16 meso2            NA
16      17 meso2            NA
17      18 meso2            NA
18      19 meso2            NA
19      20 meso2            NA
20      21 meso2            NA
cyclingstudy %>%
 select(subject, 
        timepoint,
        height.T1) %>%
 mutate(height.T1 = mean(height.T1, 
                         na.rm = TRUE), 
        .by = subject) %>%
 filter(timepoint == "meso2") %>%
 print()
# A tibble: 20 × 3
   subject timepoint height.T1
     <dbl> <chr>         <dbl>
 1       1 meso2           183
 2       2 meso2           174
 3       3 meso2           193
 4       4 meso2           175
 5       5 meso2           176
 6       6 meso2           168
 7       7 meso2           180
 8       8 meso2           179
 9       9 meso2           185
10      10 meso2           187
11      11 meso2           168
12      13 meso2           183
13      14 meso2           183
14      15 meso2           178
15      16 meso2           178
16      17 meso2           179
17      18 meso2           186
18      19 meso2           176
19      20 meso2           180
20      21 meso2           175

How to create variables using mutate()?

  • Using mathematical operations +, -, /, *, exp(), log()
  • Conditional creation of values if_else(CONDITION, IF, ELSE)

filter() variables based on their values

  • filter() uses logical statements to create a TRUE/FALSE vector. These are used to retain or filter away values.
my_true_false <- cyclingstudy$group == "INCR" 
my_true_false
TRUE FALSE TRUE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE TRUE FALSE TRUE FALSE TRUE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE TRUE FALSE TRUE FALSE TRUE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE TRUE FALSE TRUE FALSE TRUE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE TRUE FALSE
cyclingstudy %>%
        filter(my_true_false) %>%
        select(subject, group) %>%
        print()
# A tibble: 28 × 2
   subject group
     <dbl> <chr>
 1       1 INCR 
 2       3 INCR 
 3       6 INCR 
 4      10 INCR 
 5      15 INCR 
 6      16 INCR 
 7      20 INCR 
 8       1 INCR 
 9       3 INCR 
10       6 INCR 
# ℹ 18 more rows

Logical statements

  • To create a a vector of TRUE and FALSE for filter to work with we may use:
Operator Interpretation
== equal to
!= not equal to
> greater than
< less that
>= greater than or equal to
<= less than or equal to

Logical statement in a filter

cyclingstudy %>%
 filter(timepoint == "pre")        
        

cyclingstudy %>%
 filter(timepoint != "pre")

cyclingstudy %>%
 filter(VO2.max > "5000")


cyclingstudy %>%
 filter(VO2.max >= "5000")